<?php 
/**
 * Rule-driven page collector: downloads a page, converts it to UTF-8 when it
 * declares a GB charset, and extracts fields / links using rule strings in
 * which placeholder tags (e.g. 【内容区域】) mark the parts to capture.
 *
 * L(), C() and _REQUEST() are not defined in this class; they are presumably
 * inherited from Action — confirm against the base class.
 */
class FlyCollection extends Action{
	/** @var string Cache directory passed to $this->C() when querying. */
	private $cacheDir = '';
	/** @var array Not referenced by any method of this class. */
	private $note = array();

	/** @var mixed Helper returned by $this->L("Html2"); co_one_url() uses it to list links. */
	public $c_html = '';
	/** @var mixed Helper returned by $this->L("HttpDown"); loaded here but not called by any method below. */
	public $c_http_down = '';

	/**
	 * Loads the HttpDown and Html2 helpers through $this->L().
	 */
	function __construct()
	{
		$this->c_http_down = $this->L("HttpDown");
		$this->c_html      = $this->L("Html2");
	}

	/**
	 * Collects one rule node: loads the fly_co_book row for the requested id,
	 * builds the per-field rules from it, scrapes the row's "bodyurl" page and
	 * prints the extracted fields with print_r().
	 *
	 * Prints nothing when the row is not found or has no bodyurl.
	 */
	public function get_book_one(){
		// NOTE(review): the id is interpolated into SQL, so it is cast to int.
		// This assumes fly_co_book.id is an integer key — confirm against the schema.
		$id  = (int)$this->_REQUEST("id");
		$sql = "select * from fly_co_book where id='$id'";
		$one = $this->C($this->cacheDir)->findOne($sql);
		if(empty($one) || empty($one["bodyurl"])){
			return;
		}

		// Each field has a capture rule column and a "<field>_trim" filter column.
		$rule = array();
		foreach(array("title","writer","source","image","overs","chapter") as $field){
			$rule[$field] = array(
				isset($one[$field]) ? $one[$field] : "",
				isset($one[$field."_trim"]) ? $one[$field."_trim"] : ""
			);
		}
		$body = $this->co_one_page($one["bodyurl"],$rule);
		print_r($body);
	}

	/**
	 * Scrapes one page and extracts every field described in $rule.
	 *
	 * @param string $durl Page address to download.
	 * @param array  $rule Map of field name => array(capture rule, filter rules).
	 * @return array Map of field name => extracted text ("" when the field has no rule).
	 */
	public function co_one_page($durl,$rule){
		$html = $this->get_one_page($durl);
		$body = array(); // always return an array, even for an empty rule set
		foreach($rule as $key=>$va){
			if(empty($va[0])){
				$body[$key] = "";
				continue;
			}
			$body[$key] = $this->get_html_feild($html,$va[0],isset($va[1]) ? $va[1] : "");
		}
		return $body;
	}

	/**
	 * Downloads a list page, narrows it to the list area and returns the links
	 * found there by the Html2 helper, filtered by the must / must-not rules.
	 *
	 * @param string $listurl      List page address.
	 * @param string $regxrule     Area rule containing the 【列表区域】 placeholder.
	 * @param string $listmustrule Regex a link must match ('' disables the check).
	 * @param string $listnorule   Regex a link must not match, case-insensitive ('' disables the check).
	 * @return array 1-based list of array("title"=>…, "image"=>…, "link"=>…).
	 */
	public function co_one_url($listurl,$regxrule,$listmustrule,$listnorule){
		$html = $this->get_one_page($listurl);
		$html = $this->get_html_area('【列表区域】',$regxrule,$html);
		$this->c_html->SetSource($html, $listurl, 'link');

		$rtn   = array();
		$links = $this->c_html->Links;
		if(!is_array($links) && !($links instanceof Traversable)){
			return $rtn;
		}
		$i = 1;
		foreach($links as $row){
			if($listmustrule!='' && !preg_match("#".$listmustrule."#", $row['link'])){
				continue;
			}
			if($listnorule!='' && preg_match("#".$listnorule."#i", $row['link'])){
				continue;
			}
			$rtn[$i] = array(
				"title" => isset($row["title"]) ? $row["title"] : "",
				"image" => isset($row["image"]) ? $row["image"] : "",
				"link"  => $row["link"]
			);
			$i++;
		}
		return $rtn;
	}

	/**
	 * Downloads the given address with file_get_contents() and passes the
	 * result through change_code().
	 *
	 * @access public
	 * @param  string $dourl Address to download.
	 * @return string Page source, or '' when the download fails.
	 */
	public function get_one_page($dourl){
		$html = file_get_contents($dourl);
		if($html === false){
			return '';
		}
		return $this->change_code($html);
	}

	/**
	 * Downloads a list page, narrows it to the list area and extracts the
	 * title / address / variable captures described by $listrule.
	 *
	 * Matching is ungreedy, as in get_html_feild(), so that each list item
	 * yields its own match.
	 *
	 * @param string $listurl  List page address.
	 * @param string $regxrule Area rule containing the 【列表区域】 placeholder.
	 * @param string $listrule Item rule containing 【列表标题】 / 【列表地址】 / 【列表变量】.
	 * @return array array("title"=>array, "links"=>array, "var"=>array); a tag
	 *               that is absent from the rule yields an empty array.
	 */
	public function get_html_area_href($listurl,$regxrule,$listrule){
		$html     = $this->get_one_page($listurl);
		$html     = $this->get_html_area('【列表区域】',$regxrule,$html);
		$listrule = str_replace("/", "\\/", trim($listrule));

		// Work out which capture group belongs to which placeholder, then turn
		// every placeholder into a capture group.
		$tag_arr   = array("【列表标题】","【列表地址】","【列表变量】");
		$tag_key   = $this->get_tag_key($tag_arr,$listrule);
		$list_rule = str_replace($tag_arr,"(.*)",$listrule);

		$arr = array();
		if($list_rule !== '' && $html !== ''){
			preg_match_all("/{$list_rule}/isU",$html,$arr);
		}
		return array(
			"title" => $this->captures_for_tag($arr,$tag_key,"【列表标题】",$listrule),
			"links" => $this->captures_for_tag($arr,$tag_key,"【列表地址】",$listrule),
			"var"   => $this->captures_for_tag($arr,$tag_key,"【列表变量】",$listrule)
		);
	}

	/**
	 * Extracts one field from page source.
	 *
	 * The rule's placeholders become "(.*)" capture groups; the first match of
	 * the 【内容区域】 group is returned, after the optional filter rules ran.
	 * Line breaks are removed from $content before matching.
	 *
	 * @param string $content  Page source.
	 * @param string $regxrule Capture rule containing 【内容区域】 (and optionally 【内容变量】).
	 * @param string $regxtrim Filter rules for get_html_feild_trim(); empty to skip.
	 * @return string Trimmed field text, or "" when nothing matched.
	 */
	public function get_html_feild($content,$regxrule,$regxtrim){
		$regxrule = str_replace("/", "\\/", $regxrule);
		$tag_arr  = array("【内容区域】","【内容变量】");
		$tag_key  = $this->get_tag_key($tag_arr,$regxrule);
		$pattern  = str_replace($tag_arr,"(.*)",$regxrule);
		$content  = str_replace(array("\r\n", "\r", "\n"), "", $content);

		$arr = array();
		preg_match_all("#{$pattern}#isU",$content,$arr);

		// preg_match_all() fills one entry per group even without a match, so
		// the first capture itself has to be checked.
		$captures = $this->captures_for_tag($arr,$tag_key,"【内容区域】",$regxrule);
		if(!isset($captures[0])){
			return "";
		}
		$body = $captures[0];
		if($regxtrim){
			$body = $this->get_html_feild_trim($body,$regxtrim);
		}
		return trim($body);
	}

	/**
	 * Applies filter rules to extracted content.
	 *
	 * $regxtrim holds one rule per line in the form "【regex】=>【replacement】";
	 * each regex is applied with the "isU" modifiers. Lines without a regex
	 * are skipped, and a missing replacement means "replace with nothing".
	 *
	 * @param string $content  Text to filter.
	 * @param string $regxtrim Newline-separated filter rules.
	 * @return string Filtered text.
	 */
	public function get_html_feild_trim($content,$regxtrim){
		foreach(explode("\n",$regxtrim) as $row){
			$row_arr  = explode("=>",$row);
			$trim_reg = $this->strip_rule_brackets($row_arr[0]);
			$repl_txt = isset($row_arr[1]) ? $this->strip_rule_brackets($row_arr[1]) : '';
			if(!$trim_reg){
				continue;
			}
			$regs     = str_replace("/", "\\/", $trim_reg);
			$filtered = preg_replace("#".$regs."#isU", $repl_txt, $content);
			// preg_replace() returns null on a regex error; keep the content then.
			if($filtered !== null){
				$content = $filtered;
			}
		}
		return $content;
	}

	/**
	 * Maps each placeholder tag to its capture-group number.
	 *
	 * Tags found in the rule are numbered 1..k in order of appearance. Tags
	 * that do not occur are numbered after them, in the order given, so every
	 * tag has an entry (their groups simply do not exist in the match).
	 *
	 * @param array  $tag_arr  Placeholder tags to look for.
	 * @param string $listrule Rule text to search.
	 * @return array Map of tag => group number.
	 */
	public function get_tag_key($tag_arr,$listrule){
		$found   = array();
		$missing = array();
		foreach($tag_arr as $tag){
			$pos = strpos($listrule,$tag);
			// Position 0 is a valid hit, so compare against false explicitly.
			if($pos === false){
				$missing[] = $tag;
			}else{
				$found[$pos] = $tag;
			}
		}
		ksort($found); // order by position in the rule, lowest first

		$tagsort = array();
		$tag_i   = 1;
		foreach($found as $tag){
			$tagsort[$tag] = $tag_i++;
		}
		foreach($missing as $tag){
			$tagsort[$tag] = $tag_i++;
		}
		return $tagsort;
	}

	/**
	 * Splits an address into query, resource and path parts.
	 *
	 * @param string $uri Address to split.
	 * @return array Keys: 'query' (text after the first "?"), 'resource' (last
	 *               path segment when it contains a dot, else ''), 'path' (the
	 *               rest, always ending in "/"), 'url' (the input). 'file' and
	 *               'ext' (first and second dot-separated parts of the
	 *               resource) are set only when a resource was found.
	 */
	public function get_parse_url($uri)
	{
		$arr = array();

		// query: everything after the first "?"
		$x = array_pad( explode( '?', $uri, 2 ), 2, false );
		$arr['query'] = ( $x[1] === false ) ? '' : $x[1];

		// resource: the last path segment, if it looks like a file name
		$x      = array_pad( explode( '/', $x[0] ), 2, false );
		$x_last = array_pop( $x );
		if( strpos( $x_last, '.' ) === false )
		{
			$arr['resource'] = '';
			$x[] = $x_last;
		}
		else
		{
			$arr['resource'] = $x_last;
			$tmp = explode('.', $arr['resource']);
			$arr['file'] = $tmp[0];
			$arr['ext']  = '.'.$tmp[1];
		}

		// path
		$arr['path'] = implode( '/', $x );
		if( substr( $arr['path'], -1 ) !== '/' ) $arr['path'] .= '/';

		// url
		$arr['url'] = $uri;
		return $arr;
	}

	/**
	 * Returns the part of $html that sits where $sptag sits in $areaRule.
	 *
	 * @access public
	 * @param  string $sptag    Placeholder tag that splits the rule into head and tail.
	 * @param  string $areaRule Rule text: head, placeholder, tail.
	 * @param  string $html     Page source.
	 * @param  string $mode     'regex' (default) treats head and tail as regular
	 *                          expressions matched ungreedily; any other value
	 *                          treats them as literal strings.
	 * @return string Trimmed area in regex mode, raw area in string mode, or ''
	 *                when the input is empty or nothing is found.
	 */
	public function get_html_area($sptag, $areaRule, $html, $mode = 'regex')
	{
		if($html=='' || $sptag=='')
		{
			return '';
		}
		$useRegex = ($mode == 'regex');
		if($useRegex)
		{
			$areaRule = str_replace("/", "\\/", $areaRule);
		}
		$areaRules = explode($sptag, $areaRule);
		if($areaRules[0]=='')
		{
			return '';
		}
		// The rule may not contain the placeholder at all; then there is no tail.
		$tail = isset($areaRules[1]) ? $areaRules[1] : '';

		if($useRegex)
		{
			$arr = array();
			preg_match("#".trim($areaRules[0])."(.*)".trim($tail)."#isU", $html, $arr);
			return (isset($arr[1]) && $arr[1] !== '') ? trim($arr[1]) : '';
		}

		// Literal-string mode: the area lies between the first occurrence of
		// the head and the next occurrence of the tail.
		$posstart = strpos($html,$areaRules[0]);
		if($posstart===false || $tail=='')
		{
			return '';
		}
		$posstart = $posstart + strlen($areaRules[0]);
		$posend   = strpos($html,$tail,$posstart);
		if($posend!==false && $posend > $posstart)
		{
			return substr($html,$posstart,$posend-$posstart);
		}
		return '';
	}

	/**
	 * Completes a link found on a page into an absolute address.
	 *
	 * Handles http and https: the scheme is taken from $surl when it is
	 * absolute, otherwise from $refurl (falling back to http). Runs of slashes
	 * in the result are collapsed to one. "../" segments are not resolved.
	 *
	 * @access public
	 * @param  string $refurl Address of the page the link was found on.
	 * @param  string $surl   Link to complete.
	 * @return string Absolute address; '' for a bare "." / ".." style link.
	 *                When $surl is empty the base path of $refurl is returned
	 *                WITHOUT a scheme (host plus directory).
	 */
	public function get_full_url($refurl,$surl)
	{
		$schemeRegex = '#^(https?)://#i';
		$refurl = trim($refurl);
		$surl   = trim($surl);

		$scheme = preg_match($schemeRegex, $refurl, $m) ? strtolower($m[1]) : 'http';
		$urls   = @parse_url($refurl);
		$host   = (is_array($urls) && isset($urls['host'])) ? $urls['host'] : '';
		$defaultPort = ($scheme == 'https') ? 443 : 80;
		$basehost = (!isset($urls['port']) || $urls['port']==$defaultPort) ? $host : $host.':'.$urls['port'];

		// The path reported by parse_url() is not used: it is wrong for
		// addresses such as http://xxxx/nnn/aaa?fdsafd, so the directory is
		// rebuilt from the slash-separated segments instead.
		$basepath = $basehost;
		$paths = explode('/', preg_replace($schemeRegex, '', $refurl));
		$n = count($paths);
		for($i=1; $i < ($n-1); $i++)
		{
			if(strpos($paths[$i], '?') === false) $basepath .= '/'.$paths[$i];
		}
		// The last segment counts as a directory only if it has no "." or "?".
		// With a single segment that segment is the host itself, so skip it.
		if($n > 1 && !preg_match('/[\?\.]/', $paths[$n-1]))
		{
			$basepath .= '/'.$paths[$n-1];
		}
		if($surl=='')
		{
			return $basepath;
		}

		// Drop the fragment (a link that is only a fragment is left as is).
		$pos = strpos($surl, "#");
		if($pos > 0)
		{
			$surl = substr($surl, 0, $pos);
		}

		if(substr($surl, 0, 2)=='//')
		{
			// protocol-relative link: keep its host, use the page's scheme
			$okurl = substr($surl, 2);
		}
		else if($surl[0]=='/')
		{
			// a leading "/" addresses the site root
			$okurl = $basehost.$surl;
		}
		else if($surl[0]=='.')
		{
			if(strlen($surl)<=2)
			{
				return '';
			}
			else if($surl[1]=='/')
			{
				$okurl = $basepath.substr($surl, 1);
			}
			else
			{
				$okurl = $basepath.'/'.$surl;
			}
		}
		else if(preg_match($schemeRegex, $surl, $m))
		{
			// already absolute: keep it, including its own scheme
			$scheme = strtolower($m[1]);
			$okurl  = $surl;
		}
		else
		{
			$okurl = $basepath.'/'.$surl;
		}

		$okurl = preg_replace($schemeRegex, '', $okurl);
		return $scheme.'://'.preg_replace('#/{1,}#', '/', $okurl);
	}

	/**
	 * Converts page source to UTF-8 when its <meta> charset declares a GB
	 * encoding; any other (or missing) charset leaves the source untouched.
	 *
	 * @param string $html Page source.
	 * @return string Page source, converted with the Charset helper's gb2utf8() when needed.
	 */
	public function change_code($html){
		$wcharset = preg_match('/<meta.+?charset=[^\w]?([-\w]+)/i',$html,$temp) ? strtolower($temp[1]) : "";
		// Exact comparison: strpos() on a comma list returns 0 for "gbk",
		// which a truthiness test reads as "not found".
		if(in_array($wcharset, array('gbk','gbk231','gb2312'), true)){
			$char = $this->L("Charset");
			$html = $char->gb2utf8($html);
		}
		return $html;
	}

	/**
	 * Returns the address of the first image (<img ... src=...>) in $str.
	 *
	 * @param string $str  Text to search.
	 * @param string $f    "1" to copy the image into CACHE/images/<Y-m>/ and
	 *                     return the local path instead.
	 * @param string $host Site address used to complete non-absolute image paths.
	 * @param string $path Directory of the page, used to resolve relative paths.
	 * @return string Image address with the CACHE prefix removed, or "" when no
	 *                image is found. CACHE is a constant defined outside this file.
	 */
	public function get_images($str,$f="",$host="",$path="/"){
		// Group 8 is the path up to the extension, group 9 the extension.
		// The /e modifier is not used: preg_match() rejects it on PHP 7+.
		$pattern = "/<(IMG|img)(\s+|)(.*?)src(\s+|)=(\s+|)(“|‘|'|\"|)(\s+|)(.*?)(\.jpg|\.gif|\.png|\.bmp|\.jpeg)(.*?)>/is";
		if(!preg_match($pattern,$str,$rs)){
			return "";
		}

		$imgpath = $rs[8].$rs[9];
		if(strpos($imgpath,"http")!==0){
			// Relative link: walk $path up one level for every "../".
			while(strpos($imgpath,"../")!==false){
				if(substr($path,-1)=="/") $path = (string)substr($path,0,strlen($path)-1);
				$imgpath = (string)substr($imgpath,strpos($imgpath,"../")+3);
				$cut     = strrpos($path,"/");
				$path    = ($cut===false) ? "" : (string)substr($path,0,$cut);
			}
			// NOTE(review): this removes every occurrence of $path from the
			// image path, not only a leading one — kept as is, confirm intent.
			if($path!='/') $imgpath = str_replace($path,"",$imgpath);
			if(strpos($imgpath,'/')!==0) $imgpath = str_replace("//","/",$path."/".$imgpath);
			$imgpath = str_replace("//","/",$imgpath);

			// Prefix scheme://host[:port] only when $host really parses.
			$parseHost = parse_url($host);
			if(is_array($parseHost) && isset($parseHost["scheme"], $parseHost["host"])){
				$port    = isset($parseHost["port"]) ? ":".$parseHost["port"] : "";
				$imgpath = $parseHost["scheme"]."://".$parseHost["host"].$port.$imgpath;
			}
		}

		// Copy the image to the local cache.
		if($f=="1"){
			$target_dir = CACHE."/images/".date("Y-m");
			if(!is_dir($target_dir)) mkdir($target_dir,0777,true);
			$slash = strrpos($rs[8],"/");
			$base  = ($slash===false) ? $rs[8] : (string)substr($rs[8],$slash+1);
			$name  = $target_dir."/".$base.$rs[9];
			// Point at the local file only if the copy actually succeeded.
			if(copy($imgpath,$name)){
				$imgpath = $name;
			}
		}

		return str_replace(CACHE,"",$imgpath);
	}

	/**
	 * Returns the captures that belong to one placeholder tag.
	 *
	 * @param mixed  $matches Result array filled by preg_match_all().
	 * @param array  $tag_key Map of tag => group number from get_tag_key().
	 * @param string $tag     Placeholder tag to look up.
	 * @param string $rule    Rule text the map was built from.
	 * @return array Captured strings, or an empty array when the tag is not in
	 *               the rule or its group is absent from $matches.
	 */
	private function captures_for_tag($matches,$tag_key,$tag,$rule){
		if(strpos($rule,$tag)===false || !isset($tag_key[$tag])){
			return array();
		}
		$idx = $tag_key[$tag];
		if(is_array($matches) && isset($matches[$idx]) && is_array($matches[$idx])){
			return $matches[$idx];
		}
		return array();
	}

	/**
	 * Trims whitespace and removes one enclosing 【 … 】 pair from a filter-rule part.
	 *
	 * The brackets are compared as whole strings because ltrim()/rtrim() treat
	 * their second argument as a list of single bytes.
	 *
	 * @param string $text Raw rule part.
	 * @return string Rule part without the enclosing brackets.
	 */
	private function strip_rule_brackets($text){
		$open  = '【';
		$close = '】';
		$text  = trim($text);
		if(strncmp($text,$open,strlen($open))===0){
			$text = (string)substr($text,strlen($open));
		}
		if($text!=='' && substr($text,-strlen($close))===$close){
			$text = (string)substr($text,0,-strlen($close));
		}
		return $text;
	}

}
?>